1 STRATAA microbiome analysis

1.1 Imports

library(rbiom)
library(kableExtra)
library(ggplot2)
library(ggpubr)
library(ggforce)
library(patchwork)
library(vegan)
library(dplyr)
library(forcats)

1.2 Sources

The file handles are set in config.R as they’re used by both this script and data_cleaning.

source("/Users/flashton/Dropbox/GordonGroup/STRATAA_Microbiome/from_Leo/Leonardos_analysis/bin/core_functions.R")
source("/Users/flashton/Dropbox/GordonGroup/STRATAA_Microbiome/from_Leo/Leonardos_analysis/bin/config.R")

1.3 Metadata basics

First do the plots with kruskall wallis comparison between groups to see if there’s an overall difference, then do the pairwise tests because KW says there is a difference.

metadata <- read_metadata(metadata_handle)

metadata <- metadata %>% mutate(Group = if_else(Group == 'Control_HealthySerosurvey', 'Healthy Control', Group))
metadata <- metadata %>% mutate(Group = if_else(Group == 'Acute_Typhi', 'Acute typhoid', Group))
number_per_country <- metadata %>% group_by(Country) %>% summarise(count = n())

number_per_country %>% kbl() %>% kable_styling()
Country count
Bangladesh 120
Malawi 114
Nepal 76
print(sum(number_per_country$count))
## [1] 310
metadata %>% group_by(Group) %>% summarise(count = n()) %>% kbl() %>% kable_styling()
Group count
Acute typhoid 103
Carrier 110
Healthy Control 97
baseline_chars <- get_baseline_characteristics(metadata)

# baseline_chars$pct_anti
# baseline_chars$pct_anti %>% na_if(0)


# BEWARE - na_if(0.0) will convert ALL 0s to NAs, this is ok as they're only in the antibiotics of carriers and controls
# at the moment, but if others get added in, will screw with it.
# baseline_chars %>% rename(`Median age` = median_age, `Women (%)` = pct_fem, `Antibiotics in last 2 weeks (%)` = pct_anti) %>% pivot_longer(!c(Country, Group)) %>% rename(variable_name = name) %>% pivot_wider(names_from = Group, values_from = value) %>% filter(variable_name != 'number') %>% na_if(0.0) %>% kbl(digits = c(NA, NA, 1, 1, 1)) %>% kable_styling()

# na_if stopped working, so had to do this instead
baseline_chars_table <- baseline_chars %>% rename(`Median age` = median_age, `Women (%)` = pct_fem, `Antibiotics in last 2 weeks (%)` = pct_anti) %>% pivot_longer(!c(Country, Group)) %>% rename(variable_name = name) %>% pivot_wider(names_from = Group, values_from = value) %>% filter(variable_name != 'number') 

baseline_chars_table$Carrier <- replace(baseline_chars_table$Carrier,which(baseline_chars_table$Carrier==0),NA)
baseline_chars_table$`Healthy Control` <- replace(baseline_chars_table$`Healthy Control`,which(baseline_chars_table$`Healthy Control`==0),NA)
baseline_chars_table %>% kbl(digits = c(NA, NA, 1, 1, 1)) %>% kable_styling()
Country variable_name Acute typhoid Carrier Healthy Control
Bangladesh Median age 6.0 37.0 28.5
Bangladesh Women (%) 47.5 47.5 65.0
Bangladesh Antibiotics in last 2 weeks (%) 37.5 NA NA
Malawi Median age 9.7 32.0 24.0
Malawi Women (%) 64.7 57.5 65.0
Malawi Antibiotics in last 2 weeks (%) 26.5 NA NA
Nepal Median age 17.2 43.9 35.0
Nepal Women (%) 24.1 66.7 82.4
Nepal Antibiotics in last 2 weeks (%) 44.8 NA NA
# the kruskal wallis test is positive, showing a difference between the groups, so we move onto pairwise tests.
# ggplot(metadata, aes(x = Country, y = Age, fill = Group)) + geom_boxplot() + stat_compare_means(method = 'kruskal.test', label = "p")
comparisons_groups <- list(c("Acute typhoid", "Carrier"), c("Acute typhoid", "Healthy Control"), c("Carrier", "Healthy Control"))
comparisons_countries <- list(c('Bangladesh', 'Malawi'), c('Bangladesh', 'Nepal'), c('Malawi', 'Nepal'))
# default stat method when doing pairwise tests in wilcoxon.
ggboxplot(metadata, facet.by = "Country", y = "Age", x = "Group", color = "Group") + stat_compare_means(comparisons = comparisons_groups) + rremove("x.text") + rremove("xlab") + rremove("x.ticks") # +rotate_x_text(angle = 45)

ggboxplot(metadata, facet.by = "Group", y = "Age", x = "Country", color = "Country") + stat_compare_means(comparisons = comparisons_countries) + rotate_x_text(angle = 45)

country_group_sex <- metadata %>% group_by(Country, Group, Sex) %>% summarise(count = n()) 

plot_sex <- function(eg1, c){
  d <- eg1 %>% filter(Country == c)
  p <- ggplot(d, aes(x = Group, y = count, fill = Sex)) + 
    geom_bar(stat ='identity', position = 'fill') + 
    ylab('Proportion') + 
    ggtitle(c)# +
    #theme(axis.text=element_text(size=34), axis.title=element_text(size=36,face="bold"), plot.title = element_text(size = 40, face = "bold"), legend.key.size = unit(4, 'cm'), legend.title = element_text(size = 34), legend.text = element_text(size = 28))
  
  return(p)
}

m_sex <- plot_sex(country_group_sex, 'Malawi')
b_sex <- plot_sex(country_group_sex, 'Bangladesh')
n_sex <- plot_sex(country_group_sex, 'Nepal')
m_sex / b_sex / n_sex 

1.4 read in metaphlan data

strataa_metaphlan_data <- read.csv(file = file.path(metaphlan_input_folder, '2023.05.11.all_strataa_metaphlan.csv'), header= TRUE, sep = ",", row.names = 1, stringsAsFactors = FALSE, check.names=FALSE)
strataa_metaphlan_data$lowest_taxonomic_level <- sapply(str_split(row.names(strataa_metaphlan_data), "\\|"), function(x) x[length(x)])

strataa_metaphlan_metadata <- read.csv(file = file.path(metaphlan_input_folder, '2023.05.11.strataa_metadata.metaphlan.csv'), header = TRUE, sep = ",", row.names = 1, stringsAsFactors = FALSE)
strataa_metaphlan_metadata <- strataa_metaphlan_metadata %>% mutate(SampleID = rownames(strataa_metaphlan_metadata))
strataa_metaphlan_metadata <- strataa_metaphlan_metadata %>% mutate(age_bracket=cut(Age, breaks=c(0, 1, 5, 15, Inf), labels=c("<1", "1-5", "6-15", ">15")))

1.5 all countries, all groups

1.5.1 Alpha diversity - metaphlan

Alpha diversity - all countries, all groups

all_countries_all_groups_alpha <- metaphlan_alpha(strataa_metaphlan_data, strataa_metaphlan_metadata, countries_of_interest = c('Bangladesh', 'Malawi', 'Nepal'), groups_of_interest = c('Acute_Typhi', 'Carrier', 'Control_HealthySerosurvey'), comparisons = list(c('Acute_Typhi', 'Carrier'), c('Acute_Typhi', 'Control_HealthySerosurvey'), c('Control_HealthySerosurvey', 'Carrier')))

all_countries_all_groups_alpha$alpha_anova_summary_with_var_names %>% dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>% kbl() %>% kable_styling()
rownames(alpha_anova_summary[[1]]) Df Sum.Sq Mean.Sq F.value Pr..F. is_it_significant
Country:Group 4 7.82 1.96 11.7 9.31e-09 significant
Group 2 5.92 2.96 17.6 6.23e-08 significant
Country 2 2.75 1.38 8.2 0.000348 significant
Country:Sex 2 1.15 0.575 3.42 0.034 not_significant
Sex:Group 2 1.08 0.539 3.21 0.0417 not_significant
Country:Sex:Age 2 1.07 0.534 3.18 0.0431 not_significant
Country:Age 2 0.61 0.305 1.82 0.164 not_significant
Country:Sex:Group 4 1.08 0.27 1.61 0.172 not_significant
Sex:Age 1 0.308 0.308 1.84 0.177 not_significant
Country:Group:Age 4 0.834 0.209 1.24 0.293 not_significant
Group:Age 2 0.228 0.114 0.68 0.507 not_significant
Sex 1 0.0294 0.0294 0.175 0.676 not_significant
Sex:Group:Age 2 0.101 0.0506 0.302 0.74 not_significant
Country:Sex:Group:Age 4 0.0673 0.0168 0.1 0.982 not_significant
Age 1 1.07e-08 1.07e-08 6.36e-08 1 not_significant
Residuals 274 46 0.168 NA NA NA
all_countries_all_groups_alpha$alpha_plot_group

all_countries_all_groups_alpha$alpha_plot_antibiotics

all_countries_all_groups_alpha$alpha_by_country

### Beta diversity - metaphlan

All groups.

all_countries_beta_all_groups <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Bangladesh', 'Malawi', 'Nepal'), c('Acute_Typhi', 'Carrier', 'Control_HealthySerosurvey'))
all_countries_beta_all_groups$pn_res %>% kbl %>% kable_styling()
R2 Pr(>F) is_it_significant
Group 0.0759082 0.0009990 significant
Group:Age 0.0123544 0.0049950 significant
Sex:Group:Age 0.0081926 0.1058941 not_significant
Sex:Group 0.0079745 0.1118881 not_significant
Age 0.0038070 0.1698302 not_significant
Sex 0.0035891 0.2117882 not_significant
Sex:Age 0.0037273 0.2247752 not_significant
Total 1.0000000 NA NA
Residual 0.8844470 NA NA
bgd_beta_all_groups <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Bangladesh'), c('Acute_Typhi', 'Carrier', 'Control_HealthySerosurvey'))
bgd_beta_all_groups$pn_res %>% kbl %>% kable_styling()
R2 Pr(>F) is_it_significant
Group 0.2679584 0.0009990 significant
Sex:Age 0.0094338 0.1098901 not_significant
Age 0.0084979 0.1838162 not_significant
Sex 0.0078067 0.2047952 not_significant
Sex:Group 0.0155854 0.2147852 not_significant
Group:Age 0.0114756 0.5074925 not_significant
Sex:Group:Age 0.0097058 0.6913087 not_significant
Total 1.0000000 NA NA
Residual 0.6695363 NA NA
# bgd_beta_all_groups$pc12
# bgd_beta_all_groups$pc34

mal_beta_all_groups <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Malawi'), c('Acute_Typhi', 'Carrier', 'Control_HealthySerosurvey'))
mal_beta_all_groups$pn_res %>% kbl %>% kable_styling()
R2 Pr(>F) is_it_significant
Group 0.1942771 0.0009990 significant
Sex:Group 0.0231539 0.0289710 not_significant
Sex:Age 0.0138745 0.0289710 not_significant
Group:Age 0.0232750 0.0369630 not_significant
Age 0.0114622 0.0609391 not_significant
Sex 0.0075054 0.3236763 not_significant
Sex:Group:Age 0.0125237 0.5934066 not_significant
Total 1.0000000 NA NA
Residual 0.7139281 NA NA
# mal_beta_all_groups$pc12
# mal_beta_all_groups$pc34

nep_beta_all_groups <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Nepal'), c('Acute_Typhi', 'Carrier', 'Control_HealthySerosurvey'))
nep_beta_all_groups$pn_res %>% kbl %>% kable_styling()
R2 Pr(>F) is_it_significant
Group 0.0898195 0.0009990 significant
Sex 0.0176933 0.1118881 not_significant
Sex:Age 0.0150035 0.2247752 not_significant
Age 0.0125478 0.4445554 not_significant
Sex:Group 0.0239139 0.5294705 not_significant
Sex:Group:Age 0.0175838 0.8611389 not_significant
Group:Age 0.0164764 0.9410589 not_significant
Total 1.0000000 NA NA
Residual 0.8069619 NA NA
all_countries_beta_all_groups$pc12 / bgd_beta_all_groups$pc12 / nep_beta_all_groups$pc12

1.6 Healthy vs Typhi

1.6.1 Plot phyla bar plots

phyla <- strataa_metaphlan_data %>% mutate(clade_name = rownames(strataa_metaphlan_data)) %>% filter(grepl("p__", clade_name)) %>% filter(!grepl("c__", clade_name)) %>% pivot_longer(!c(clade_name, lowest_taxonomic_level), names_to = "sample", values_to = "relative_abundance")
metadata_for_phyla_plots <- strataa_metaphlan_metadata %>% dplyr::select(SampleID, Group, Country)

phyla_clean_metadata <- prep_data_to_plot_phyla(phyla, strataa_metaphlan_metadata)

order_of_groups <- c("Typhi", "Healthy")
bangladesh_phyla_plot <- plot_per_country_abundance(phyla_clean_metadata = phyla_clean_metadata, country = "Bangladesh", group_order = order_of_groups)
# bangladesh_phyla_plot
malawi_phyla_plot <- plot_per_country_abundance(phyla_clean_metadata = phyla_clean_metadata, country = "Malawi", group_order = order_of_groups)
nepal_phyla_plot <- plot_per_country_abundance(phyla_clean_metadata = phyla_clean_metadata, country = "Nepal", group_order = order_of_groups)

bangladesh_phyla_plot / malawi_phyla_plot / nepal_phyla_plot

# going back to phyla so that get all the weird rare ones.
# need to join to meta-data
# remove carriers
# group by clade, group, country
# summarise to get the median.
# pivot wider to get the table.
# dplyr::select(order(colnames(.))) rearranges the columns in alphabetical order
# relocate moves the clade_name column to the first column.
phyla %>% left_join(metadata_for_phyla_plots, by = c("sample" = "SampleID")) %>% filter(Group != 'Carrier') %>% group_by(clade_name, Group, Country) %>% summarise(median_prevalence = median(relative_abundance)) %>% pivot_wider(names_from = c('Country', 'Group'), values_from =  'median_prevalence') %>% dplyr::select(order(colnames(.))) %>% relocate(clade_name, .before = 1) %>% kbl() %>% kable_styling()
clade_name Bangladesh_Acute_Typhi Bangladesh_Control_HealthySerosurvey Malawi_Acute_Typhi Malawi_Control_HealthySerosurvey Nepal_Acute_Typhi Nepal_Control_HealthySerosurvey
k__Archaea|p__Candidatus_Thermoplasmatota 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Archaea|p__Euryarchaeota 0.000000 0.000000 0.249010 0.000000 0.00000 0.22474
k__Archaea|p__Thaumarchaeota 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Bacteria|p__Acidobacteria 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Bacteria|p__Actinobacteria 14.361185 13.136440 1.893725 0.524965 3.33634 3.81550
k__Bacteria|p__Bacteria_unclassified 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Bacteria|p__Bacteroidetes 0.911970 2.216730 15.361165 34.630185 28.24679 20.53695
k__Bacteria|p__Candidatus_Melainabacteria 0.000000 0.000000 0.038085 0.052210 0.00000 0.02157
k__Bacteria|p__Candidatus_Saccharibacteria 0.017310 0.015625 0.000000 0.000000 0.00000 0.00203
k__Bacteria|p__Chloroflexi 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Bacteria|p__Elusimicrobia 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Bacteria|p__Firmicutes 80.670075 72.161565 60.161350 58.311770 53.74090 60.17250
k__Bacteria|p__Fusobacteria 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Bacteria|p__Lentisphaerae 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Bacteria|p__Proteobacteria 0.629715 1.272295 6.606440 6.195335 2.90174 2.52509
k__Bacteria|p__Spirochaetes 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Bacteria|p__Synergistetes 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Bacteria|p__Tenericutes 0.000000 0.000000 0.002715 0.000000 0.01965 0.01362
k__Bacteria|p__Verrucomicrobia 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Eukaryota|p__Ascomycota 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Eukaryota|p__Eukaryota_unclassified 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000

1.6.2 alpha diversity

Alpha diversity - all countries, healthy and acute

all_countries_healthy_acute_alpha <- metaphlan_alpha(strataa_metaphlan_data, strataa_metaphlan_metadata, countries_of_interest = c('Bangladesh', 'Malawi', 'Nepal'), groups_of_interest = c('Acute_Typhi', 'Control_HealthySerosurvey'), comparisons = list(c('Acute_Typhi', 'Control_HealthySerosurvey')))

# all_countries_healthy_acute_alpha$alpha_by_country
all_countries_healthy_acute_alpha$alpha_anova_summary_with_var_names %>% dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>% kbl() %>% kable_styling()
rownames(alpha_anova_summary[[1]]) Df Sum.Sq Mean.Sq F.value Pr..F. is_it_significant
Country 2 5.74 2.87 17.4 1.36e-07 significant
Country:Age:Antibiotics_taken_before_sampling_yes_no_assumptions 2 1.68 0.842 5.12 0.00699 significant
Sex:Age 1 1.07 1.07 6.5 0.0117 not_significant
Country:Sex 2 1.5 0.75 4.56 0.0119 not_significant
Country:Sex:Antibiotics_taken_before_sampling_yes_no_assumptions 2 1.39 0.693 4.21 0.0165 not_significant
Country:Age 2 1.02 0.51 3.1 0.0477 not_significant
Sex:Group 1 0.628 0.628 3.82 0.0525 not_significant
Country:Antibiotics_taken_before_sampling_yes_no_assumptions 2 0.816 0.408 2.48 0.0871 not_significant
Country:Group 2 0.698 0.349 2.12 0.123 not_significant
Antibiotics_taken_before_sampling_yes_no_assumptions 2 0.618 0.309 1.88 0.157 not_significant
Group 1 0.164 0.164 0.997 0.319 not_significant
Age:Antibiotics_taken_before_sampling_yes_no_assumptions 2 0.359 0.179 1.09 0.339 not_significant
Sex 1 0.0592 0.0592 0.36 0.55 not_significant
Country:Sex:Age 2 0.188 0.0938 0.57 0.567 not_significant
Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions 1 0.0511 0.0511 0.31 0.578 not_significant
Sex:Group:Age 1 0.0368 0.0368 0.224 0.637 not_significant
Group:Age 1 0.0147 0.0147 0.0891 0.766 not_significant
Country:Group:Age 2 0.0713 0.0357 0.217 0.805 not_significant
Age 1 0.00772 0.00772 0.0469 0.829 not_significant
Country:Sex:Group 2 0.0595 0.0297 0.181 0.835 not_significant
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions 1 0.00675 0.00675 0.041 0.84 not_significant
Country:Sex:Group:Age 2 0.05 0.025 0.152 0.859 not_significant
Country:Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions 1 0.000136 0.000136 0.000828 0.977 not_significant
Residuals 163 26.8 0.165 NA NA NA
all_countries_healthy_acute_alpha$alpha_plot_group

all_countries_healthy_acute_alpha$alpha_plot_antibiotics

alpha diversity, malawi only

malawi_healthy_acute_alpha <- metaphlan_alpha(strataa_metaphlan_data, strataa_metaphlan_metadata, countries_of_interest = c('Malawi'), groups_of_interest = c('Acute_Typhi', 'Control_HealthySerosurvey'), comparisons = list(c('Acute_Typhi', 'Control_HealthySerosurvey')))

malawi_healthy_acute_alpha$alpha_anova_summary_with_var_names %>% dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>% kbl() %>% kable_styling()
rownames(alpha_anova_summary[[1]]) Df Sum.Sq Mean.Sq F.value Pr..F. is_it_significant
Age:Antibiotics_taken_before_sampling_yes_no_assumptions 1 2.27 2.27 16 0.000167 significant
Antibiotics_taken_before_sampling_yes_no_assumptions 1 1.31 1.31 9.27 0.0034 significant
Sex 1 1.16 1.16 8.21 0.00567 significant
Group 1 0.84 0.84 5.93 0.0177 not_significant
Age 1 0.451 0.451 3.19 0.079 not_significant
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions 1 0.406 0.406 2.87 0.0952 not_significant
Sex:Group 1 0.313 0.313 2.21 0.142 not_significant
Sex:Group:Age 1 0.0978 0.0978 0.691 0.409 not_significant
Sex:Age 1 0.0164 0.0164 0.116 0.734 not_significant
Group:Age 1 0.00122 0.00122 0.00864 0.926 not_significant
Residuals 63 8.91 0.141 NA NA NA
malawi_healthy_acute_alpha$alpha_plot_group

malawi_healthy_acute_alpha$alpha_plot_antibiotics

1.6.3 beta diversity

Acute vs healthy.

all_countries_beta_acute_healthy <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Bangladesh', 'Malawi', 'Nepal'), c('Acute_Typhi', 'Control_HealthySerosurvey'))
all_countries_beta_acute_healthy$pn_res %>% kbl %>% kable_styling()
R2 Pr(>F) is_it_significant
Group 0.0285252 0.0009990 significant
Group:Age 0.0117687 0.0059940 significant
Age 0.0092014 0.0399600 not_significant
Sex 0.0070879 0.1178821 not_significant
Sex:Group:Age 0.0072077 0.1268731 not_significant
Antibiotics_taken_before_sampling_yes_no_assumptions 0.0117438 0.2057942 not_significant
Sex:Age 0.0058394 0.2337662 not_significant
Sex:Group 0.0057923 0.2627373 not_significant
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0055130 0.2757243 not_significant
Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0043096 0.5254745 not_significant
Age:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0070480 0.8391608 not_significant
Total 1.0000000 NA NA
Residual 0.8959630 NA NA
bgd_beta_acute_healthy <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Bangladesh'), c('Acute_Typhi', 'Control_HealthySerosurvey'))
bgd_beta_acute_healthy$pn_res %>% kbl %>% kable_styling()
R2 Pr(>F) is_it_significant
Group 0.0782718 0.0009990 significant
Sex 0.0230592 0.0379620 not_significant
Antibiotics_taken_before_sampling_yes_no_assumptions 0.0199564 0.0629371 not_significant
Sex:Age 0.0168558 0.1398601 not_significant
Age 0.0146689 0.2367632 not_significant
Group:Age 0.0109377 0.4795205 not_significant
Sex:Group:Age 0.0100500 0.5384615 not_significant
Age:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0080933 0.6863137 not_significant
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0074906 0.8311688 not_significant
Sex:Group 0.0067483 0.9140859 not_significant
Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0059228 0.9230769 not_significant
Total 1.0000000 NA NA
Residual 0.7979452 NA NA
mal_beta_acute_healthy <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Malawi'), c('Acute_Typhi', 'Control_HealthySerosurvey'))
mal_beta_acute_healthy$pn_res %>% kbl %>% kable_styling()
R2 Pr(>F) is_it_significant
Group 0.1995900 0.0009990 significant
Age:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0371210 0.0009990 significant
Antibiotics_taken_before_sampling_yes_no_assumptions 0.0381931 0.0029970 significant
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0267524 0.0169830 not_significant
Sex:Group 0.0200496 0.0279720 not_significant
Age 0.0194908 0.0319680 not_significant
Sex 0.0141068 0.1228771 not_significant
Sex:Age 0.0144797 0.1328671 not_significant
Sex:Group:Age 0.0091754 0.4915085 not_significant
Group:Age 0.0078947 0.6113886 not_significant
Total 1.0000000 NA NA
Residual 0.6131466 NA NA
nep_beta_acute_healthy <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Nepal'), c('Acute_Typhi', 'Control_HealthySerosurvey'))
nep_beta_acute_healthy$pn_res %>% kbl %>% kable_styling()
R2 Pr(>F) is_it_significant
Group 0.0598990 0.0019980 significant
Sex 0.0484279 0.0149850 not_significant
Sex:Group 0.0246755 0.3126873 not_significant
Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0244159 0.3196803 not_significant
Antibiotics_taken_before_sampling_yes_no_assumptions 0.0450942 0.4175824 not_significant
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0207951 0.4925075 not_significant
Sex:Group:Age 0.0175717 0.6813187 not_significant
Age 0.0154038 0.7932068 not_significant
Sex:Age 0.0148604 0.8301698 not_significant
Group:Age 0.0102689 0.9900100 not_significant
Age:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0188359 0.9980020 not_significant
Total 1.0000000 NA NA
Residual 0.6997518 NA NA
all_countries_beta_acute_healthy$pc12 + bgd_beta_acute_healthy$pc12 + mal_beta_acute_healthy$pc12 + nep_beta_acute_healthy$pc12

1.6.4 maaslin2 taxonomy

Maaslin basics

groups_to_analyse <- c('Acute_Typhi', 'Control_HealthySerosurvey')
bang_variables_for_analysis <- c("Group", "Sex", "Age", "Antibiotics_taken_before_sampling_yes_no_assumptions")
mwi_variables_for_analysis <- c("Group", "Sex", "Age", "Antibiotics_taken_before_sampling_yes_no_assumptions", "sequencing_lane")
nep_variables_for_analysis <- c("Group", "Sex", "Age", "Antibiotics_taken_before_sampling_yes_no_assumptions")
bangladesh_taxonomic_maaslin <- read_in_maaslin('Bangladesh', groups_to_analyse, bang_variables_for_analysis, 'metaphlan')
malawi_taxonomic_maaslin <- read_in_maaslin('Malawi', groups_to_analyse, mwi_variables_for_analysis, 'metaphlan')
nepal_taxonomic_maaslin <- read_in_maaslin('Nepal', groups_to_analyse, nep_variables_for_analysis, 'metaphlan')
bangladesh_taxonomic_maaslin_filtered <- filter_taxonomic_maaslin(bangladesh_taxonomic_maaslin)
malawi_taxonomic_maaslin_filtered <- filter_taxonomic_maaslin(malawi_taxonomic_maaslin)
nepal_taxonomic_maaslin_filtered <- filter_taxonomic_maaslin(nepal_taxonomic_maaslin)
bangladesh_maaslin_stats <- basic_maaslin_stats(bangladesh_taxonomic_maaslin_filtered, 'Bangladesh', bang_variables_for_analysis, groups_to_analyse)
malawi_maaslin_stats <- basic_maaslin_stats(malawi_taxonomic_maaslin_filtered, 'Malawi', mwi_variables_for_analysis, groups_to_analyse)
nepal_maaslin_stats <- basic_maaslin_stats(nepal_taxonomic_maaslin_filtered, 'Nepal', nep_variables_for_analysis, groups_to_analyse)
malawi_maaslin_stats$volcano_plot / bangladesh_maaslin_stats$volcano_plot / nepal_maaslin_stats$volcano_plot

nrow(malawi_maaslin_stats$maaslin_results_sig)
## [1] 296
nrow(bangladesh_maaslin_stats$maaslin_results_sig)
## [1] 23
nrow(nepal_maaslin_stats$maaslin_results_sig)
## [1] 0

There were 296 species significantly (q-val < 0.05) associated with health/disease in Malawi, in Bangladesh, and in Nepal.

Combine the taxonomic maaslins, and print out the species that are sig in both and share directions.

Because sequencing run and participant type are totally confounded for Bangladesh, need to exclude sequencing run from the final model for Bangladesh (otherwise, wipes out the signals).

associated at both sites

bang_mal <- list(bangladesh_taxonomic_maaslin_filtered, malawi_taxonomic_maaslin_filtered)
combined_results <- run_combine_maaslins(bang_mal, c('_bang', '_mal'), mwi_variables_for_analysis, groups_to_analyse, 'metaphlan', maaslin_taxonomic_output_root_folder)

# View(combined_results$positive_coef)
# View(combined_results$negative_coef)


combined_results$positive_coef %>% filter(grepl('^s', lowest_taxonomic_level)) %>% 
  select(!c(metadata, value, N_bang, N.not.0_bang, pval_bang, N_mal, N.not.0_mal, pval_mal)) %>% 
  rename(Species = lowest_taxonomic_level, `Coefficient Bangladesh` = coef_bang, `Standard Error Bangladesh` = stderr_bang, `Q-value Bangladesh` = qval_bang, `Coefficient Malawi` = coef_mal, `Standard Error Malawi` = stderr_mal, `Q-value Malawi` = qval_mal) %>% 
  dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>% 
  kbl() %>% kable_styling()
feature Species Coefficient Bangladesh Standard Error Bangladesh Q-value Bangladesh Coefficient Malawi Standard Error Malawi Q-value Malawi
k__Bacteria.p__Firmicutes.c__Clostridia.o__Eubacteriales.f__Clostridiaceae.g__Clostridium.s__Clostridium_SGB6179 s__Clostridium_SGB6179 8.79 1.34 4.25e-05 3.11 0.936 0.0286
k__Bacteria.p__Bacteroidetes.c__Bacteroidia.o__Bacteroidales.f__Prevotellaceae.g__Prevotella.s__Prevotella_copri_clade_A s__Prevotella_copri_clade_A 4.54 0.978 0.00971 7.64 1.25 1.21e-05
k__Bacteria.p__Firmicutes.c__Negativicutes.o__Veillonellales.f__Veillonellaceae.g__GGB4266.s__GGB4266_SGB5809 s__GGB4266_SGB5809 4.58 1.09 0.0147 6.94 0.981 4.38e-07
k__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Pasteurellales.f__Pasteurellaceae.g__Haemophilus.s__Haemophilus_parainfluenzae s__Haemophilus_parainfluenzae 3.64 0.979 0.03 6.92 0.953 2.26e-07
k__Bacteria.p__Firmicutes.c__Clostridia.o__Eubacteriales.f__Peptostreptococcaceae.g__Romboutsia.s__Romboutsia_timonensis s__Romboutsia_timonensis 4.52 1.25 0.0386 5.07 0.903 5.91e-05
combined_results$negative_coef %>% filter(grepl('^s', lowest_taxonomic_level)) %>% 
  select(!c(metadata, value, N_bang, N.not.0_bang, pval_bang, N_mal, N.not.0_mal, pval_mal)) %>% 
  rename(Species = lowest_taxonomic_level, `Coefficient Bangladesh` = coef_bang, `Standard Error Bangladesh` = stderr_bang, `Q-value Bangladesh` = qval_bang, `Coefficient Malawi` = coef_mal, `Standard Error Malawi` = stderr_mal, `Q-value Malawi` = qval_mal) %>% 
  dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>% 
  kbl() %>% kable_styling()
feature Species Coefficient Bangladesh Standard Error Bangladesh Q-value Bangladesh Coefficient Malawi Standard Error Malawi Q-value Malawi
k__Bacteria.p__Firmicutes.c__Clostridia.o__Eubacteriales.f__Lachnospiraceae.g__Lachnospiraceae_unclassified.s__Lachnospiraceae_bacterium s__Lachnospiraceae_bacterium -3.05 0.838 0.0366 -2.87 0.785 0.0135
# todo - refactoring the run_combine_maaslins means that we dont get the species that are only associated at one site. need to fix that.

# nrow(combined_results$mwi_maaslin_only)
# nrow(combined_results$bang_maaslin_only)

There were species significantly (q-val < 0.05) associated with health/disease in Malawi only and in Bangladesh only.

The ones associated at only one site are written out to a file, you can look at them manually there.

1.6.5 plots of species of interest

strataa_metaphlan_data_longer <- strataa_metaphlan_data %>% mutate(feature = rownames(strataa_metaphlan_data)) %>% pivot_longer(!c(feature, lowest_taxonomic_level), names_to = "SampleID", values_to = "prevalence")
strataa_metaphlan_data_longer_meta <- strataa_metaphlan_data_longer %>% left_join(strataa_metaphlan_metadata, by = c("SampleID" = "SampleID"))


pc <- run_plot_species_of_interest(strataa_metaphlan_data_longer_meta, 's__Prevotella_copri_clade_A')

cs <- run_plot_species_of_interest(strataa_metaphlan_data_longer_meta, 's__Clostridium_SGB6179')

SGB5809 <-run_plot_species_of_interest(strataa_metaphlan_data_longer_meta, 's__GGB4266_SGB5809')

hp <- run_plot_species_of_interest(strataa_metaphlan_data_longer_meta, 's__Haemophilus_parainfluenzae')

rt <- run_plot_species_of_interest(strataa_metaphlan_data_longer_meta, 's__Romboutsia_timonensis')

lb <- run_plot_species_of_interest(strataa_metaphlan_data_longer_meta, 's__Lachnospiraceae_bacterium')

# pc + cs + SGB5809 + hp + rt + lb

1.6.6 P. copri clades

Print the results for all the P. copri clades

# all_taxa_maaslin <- combined_maaslins$all_features
# p_copri_maaslin <- all_taxa_maaslin %>% filter(grepl('_Prevotella_copri_', feature)) %>% filter(qval_bang < 0.05 | qval_mal < 0.05)
# write_csv(p_copri_maaslin, file.path(maaslin_taxonomic_output_root_folder, 'combined_maaslins_p_copri.csv'))

1.6.7 maaslin functional groups

Functional groups associated with health

bang_variables_for_analysis <- c("Group", "Sex", "Age", "Antibiotics_taken_before_sampling_yes_no_assumptions")
mwi_variables_for_analysis <- c("Group", "Sex", "Age", "Antibiotics_taken_before_sampling_yes_no_assumptions", "sequencing_lane")
# nep_variables_for_analysis <- c("Group", "Sex", "Age", "Antibiotics_taken_before_sampling_yes_no_assumptions")
groups_to_analyse <- c('Acute_Typhi', 'Control_HealthySerosurvey')

bang_func_maaslin <- read_in_maaslin('Bangladesh', groups_to_analyse, bang_variables_for_analysis, 'bigmap')
mal_func_maaslin <- read_in_maaslin('Malawi', groups_to_analyse, mwi_variables_for_analysis, 'bigmap')
# combined_results <- run_combine_maaslins(bang_func_maaslin, mal_func_maaslin, bang_variables_for_analysis, mwi_variables_for_analysis, groups_to_analyse, 'bigmap', maaslin_functional_output_root_folder)

bang_mal <- list(bang_func_maaslin, mal_func_maaslin)
combined_results <- run_combine_maaslins(bang_mal, c('_bang', '_mal'), mwi_variables_for_analysis, groups_to_analyse, 'bigmap', maaslin_functional_output_root_folder)


combined_results$positive_coef %>%
  select(!c(metadata, value, N_bang, N.not.0_bang, pval_bang, N_mal, N.not.0_mal, pval_mal)) %>% 
  rename(`Coefficient Bangladesh` = coef_bang, `Standard Error Bangladesh` = stderr_bang, `Q-value Bangladesh` = qval_bang, `Coefficient Malawi` = coef_mal, `Standard Error Malawi` = stderr_mal, `Q-value Malawi` = qval_mal) %>% 
  dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>% 
  kbl() %>% kable_styling()
MGC_class Species feature Coefficient Bangladesh Standard Error Bangladesh Q-value Bangladesh Coefficient Malawi Standard Error Malawi Q-value Malawi
succinate2propionate Dialister_invisus_DSM_15470_genomic_scaffold gb.GG698602.1.region002.GC_DNA..Entryname.succinate2propionate..OS.Dialister_invisus_DSM_15470_genomic_scaffold..SMASHregion.region002..NR.1 4.49 0.769 0.000238 4.32 0.937 0.00196
Pyruvate2acetate.formate Prevotella_copri_strain_AM22.19 gb.QRIF01000001.1.region001.GC_DNA..Entryname.Pyruvate2acetate.formate..OS.Prevotella_copri_strain_AM22.19..SMASHregion.region001..NR.7 4.95 0.865 0.000259 4.76 1.23 0.0122
TPP_AA_metabolism Prevotella_sp._S7_MS_2 gb.JRPT01000001.1.region001.GC_DNA..Entryname.TPP_AA_metabolism..OS.Prevotella_sp._S7_MS_2..SMASHregion.region001..NR.11..BG.2 4.86 0.89 0.000381 5.14 1.16 0.00314
Arginine2_Hcarbonate .Clostridium._sordellii_genome_assembly gb.LN679998.1.region007.GC_DNA..Entryname.Arginine2_Hcarbonate..OS..Clostridium._sordellii_genome_assembly..SMASHregion.region007..NR.2 4.59 0.824 0.000381 3.51 0.732 0.00116
Others_HGD_unassigned Prevotella_copri_strain_AF24.12 gb.QRVA01000039.1.region001.GC_DNA..Entryname.Others_HGD_unassigned..OS.Prevotella_copri_strain_AF24.12..SMASHregion.region001..NR.6 4.72 0.862 0.000381 5.74 1.09 0.000359
Pyruvate2acetate.formate Prevotella_sp._AM34.19LB gb.QSIG01000002.1.region002.GC_DNA..Entryname.Pyruvate2acetate.formate..OS.Prevotella_sp._AM34.19LB..SMASHregion.region002..NR.3 4.25 0.783 0.000381 3.28 0.956 0.0336
TPP_fatty_acids Prevotella_copri_strain_AM22.19 gb.QRIF01000003.1.region001.GC_DNA..Entryname.TPP_fatty_acids..OS.Prevotella_copri_strain_AM22.19..SMASHregion.region001..NR.6 5.07 0.952 0.000422 5.33 1.07 0.000706
Rnf_complex Prevotella_copri_strain_AF38.11 gb.QROP01000022.1.region001.GC_DNA..Entryname.Rnf_complex..OS.Prevotella_copri_strain_AF38.11..SMASHregion.region001..NR.6 4.74 0.914 0.000649 5.23 1.12 0.00176
PFOR_II_pathway Romboutsia_ilealis_strain_CRIB_genome gb.LN555523.1.region001.GC_DNA..Entryname.PFOR_II_pathway..OS.Romboutsia_ilealis_strain_CRIB_genome..SMASHregion.region001..NR.1 4.84 0.983 0.00104 2.98 0.719 0.00629
Others_HGD_unassigned Romboutsia_ilealis_strain_CRIB_genome gb.LN555523.1.region003.GC_DNA..Entryname.Others_HGD_unassigned..OS.Romboutsia_ilealis_strain_CRIB_genome..SMASHregion.region003..NR.1 4.59 0.935 0.00105 2.76 0.558 0.000812
porA Prevotella_copri_strain_AF10.17 gb.QSAV01000013.1.region001.GC_DNA..Entryname.porA..OS.Prevotella_copri_strain_AF10.17..SMASHregion.region001..NR.6 4.46 0.918 0.00124 5.08 1.17 0.00387
Rnf_complex Romboutsia_ilealis_strain_CRIB_genome gb.LN555523.1.region002.GC_DNA..Entryname.Rnf_complex..OS.Romboutsia_ilealis_strain_CRIB_genome..SMASHregion.region002..NR.1 4.36 0.957 0.0031 2.83 0.622 0.00228
PFOR_II_pathway Romboutsia_ilealis_strain_CRIB_genome gb.LN555523.1.region004.GC_DNA..Entryname.PFOR_II_pathway..OS.Romboutsia_ilealis_strain_CRIB_genome..SMASHregion.region004..NR.1 4.39 0.968 0.00318 3.37 0.814 0.00646
Formate_dehydrogenase Haemophilus_sp._HMSC61B11_genomic_scaffold gb.KV837963.1.region001.GC_DNA..Entryname.Formate_dehydrogenase..OS.Haemophilus_sp._HMSC61B11_genomic_scaffold..SMASHregion.region001..NR.5 3.41 0.754 0.00327 4.89 0.985 0.000767
Pyruvate2acetate.formate.Acetyl.CoA_pathway Romboutsia_ilealis_strain_CRIB_genome gb.LN555523.1.region005.GC_DNA..Entryname.Pyruvate2acetate.formate.Acetyl.CoA_pathway..OS.Romboutsia_ilealis_strain_CRIB_genome..SMASHregion.region005..NR.1 4.26 1 0.0072 4.58 1.11 0.00659
Pyruvate2acetate.formate Clostridium_carboxidivorans gb.CP011803.1.region013.GC_DNA..Entryname.Pyruvate2acetate.formate..OS.Clostridium_carboxidivorans..SMASHregion.region013..NR.1 3.69 0.89 0.00954 3.11 0.762 0.00733
OD_eut_pdu_related.PFOR_II_pathway Paeniclostridium_sordellii_strain_AM370 gb.CP014150.1.region011.GC_DNA..Entryname.OD_eut_pdu_related.PFOR_II_pathway..OS.Paeniclostridium_sordellii_strain_AM370..SMASHregion.region011..NR.4 2.61 0.629 0.00954 3.94 0.574 4.58e-06
TPP_AA_metabolism.Arginine2putrescine Haemophilus_sp._HMSC61B11_genomic_scaffold gb.KV837955.1.region001.GC_DNA..Entryname.TPP_AA_metabolism.Arginine2putrescine..OS.Haemophilus_sp._HMSC61B11_genomic_scaffold..SMASHregion.region001..NR.5 3.93 0.951 0.00959 4.39 0.918 0.0012
Respiratory_glycerol Haemophilus_parainfluenzae_HK2019 gb.AJTC01000042.1.region001.GC_DNA..Entryname.Respiratory_glycerol..OS.Haemophilus_parainfluenzae_HK2019..SMASHregion.region001..NR.5 3.55 0.882 0.0136 5.14 0.975 0.000344
Pyruvate2acetate.formate Haemophilus_parainfluenzae_HK262 gb.AJMW01000041.1.region001.GC_DNA..Entryname.Pyruvate2acetate.formate..OS.Haemophilus_parainfluenzae_HK262..SMASHregion.region001..NR.5 3.17 0.791 0.0137 5.08 1.01 0.000657
TPP_AA_metabolism Clostridium_celatum_DSM_1785_genomic_scaffold gb.KB291615.1.region002.GC_DNA..Entryname.TPP_AA_metabolism..OS.Clostridium_celatum_DSM_1785_genomic_scaffold..SMASHregion.region002..NR.1 3.24 0.813 0.0138 2.5 0.72 0.0303
Arginine2putrescine.Putrescine2spermidine Romboutsia_sp._Frifi_strain_FRIFI_genome gb.LN650648.1.region002.GC_DNA..Entryname.Arginine2putrescine.Putrescine2spermidine..OS.Romboutsia_sp._Frifi_strain_FRIFI_genome..SMASHregion.region002..NR.1 2.66 0.683 0.0177 2.57 0.523 0.000868
Pyruvate2acetate.formate Haemophilus_haemolyticus_HK386 gb.AJSV01000030.1.region001.GC_DNA..Entryname.Pyruvate2acetate.formate..OS.Haemophilus_haemolyticus_HK386..SMASHregion.region001..NR.2 2.38 0.631 0.0234 5.02 0.724 3.95e-06
Pyruvate2acetate.formate Haemophilus_aegyptius_ATCC_11116_genomic_scaffold gb.GL878527.1.region001.GC_DNA..Entryname.Pyruvate2acetate.formate..OS.Haemophilus_aegyptius_ATCC_11116_genomic_scaffold..SMASHregion.region001..NR.2 2.16 0.574 0.0236 3.52 0.679 0.000428
acetate2butyrate Clostridium_botulinum_B_str._Eklund gb.CP001056.1.region002.GC_DNA..Entryname.acetate2butyrate..OS.Clostridium_botulinum_B_str._Eklund..SMASHregion.region002..NR.5..BG.2 2.42 0.646 0.0245 2.53 0.754 0.0396
OD_fatty_acids Dialister_succinatiphilus_YIT_11850_genomic_scaffold gb.JH591188.1.region001.GC_DNA..Entryname.OD_fatty_acids..OS.Dialister_succinatiphilus_YIT_11850_genomic_scaffold..SMASHregion.region001..NR.1 2.94 0.793 0.0265 2.54 0.579 0.00342
Pyruvate2acetate.formate Clostridium_butyricum_strain_JKY6D1_chromosome gb.CP013352.1.region004.GC_DNA..Entryname.Pyruvate2acetate.formate..OS.Clostridium_butyricum_strain_JKY6D1_chromosome..SMASHregion.region004..NR.6 3.46 0.938 0.0265 3.35 0.936 0.0241
TPP_AA_metabolism Haemophilus_sputorum_HK_2154 gb.ALJP01000004.1.region001.GC_DNA..Entryname.TPP_AA_metabolism..OS.Haemophilus_sputorum_HK_2154..SMASHregion.region001..NR.1 2.96 0.812 0.0299 4.52 0.866 0.000392
Fumarate2succinate Haemophilus_sp._HMSC61B11_genomic_scaffold gb.KV838003.1.region001.GC_DNA..Entryname.Fumarate2succinate..OS.Haemophilus_sp._HMSC61B11_genomic_scaffold..SMASHregion.region001..NR.5 3.34 0.924 0.0312 4.62 1.01 0.0022
Respiratory_glycerol Aggregatibacter_sp._oral_taxon_458_str._W10330_genomic_scaffold gb.KE952617.1.region001.GC_DNA..Entryname.Respiratory_glycerol..OS.Aggregatibacter_sp._oral_taxon_458_str._W10330_genomic_scaffold..SMASHregion.region001..NR.1 2.91 0.808 0.0323 4.64 0.926 0.000678
TPP_AA_metabolism Haemophilus_pittmaniae_HK_85 gb.AFUV01000010.1.region001.GC_DNA..Entryname.TPP_AA_metabolism..OS.Haemophilus_pittmaniae_HK_85..SMASHregion.region001..NR.1 2.43 0.686 0.0353 5.22 0.896 8.17e-05
PFOR_II_pathway Clostridium_celatum_DSM_1785_genomic_scaffold gb.KB291660.1.region001.GC_DNA..Entryname.PFOR_II_pathway..OS.Clostridium_celatum_DSM_1785_genomic_scaffold..SMASHregion.region001..NR.1 3.51 0.994 0.0353 3.77 0.982 0.0132
acetate2butyrate Clostridium_celatum_DSM_1785_genomic_scaffold gb.KB291615.1.region001.GC_DNA..Entryname.acetate2butyrate..OS.Clostridium_celatum_DSM_1785_genomic_scaffold..SMASHregion.region001..NR.1 3.41 0.981 0.0397 3.41 0.931 0.0197
Rnf_complex Haemophilus_parainfluenzae_HK262 gb.AJMW01000064.1.region001.GC_DNA..Entryname.Rnf_complex..OS.Haemophilus_parainfluenzae_HK262..SMASHregion.region001..NR.6..BG.2 2.8 0.814 0.0418 5 0.992 0.000636
OD_fatty_acids.PFOR_II_pathway Clostridium_botulinum_B_str._Eklund gb.CP001056.1.region005.GC_DNA..Entryname.OD_fatty_acids.PFOR_II_pathway..OS.Clostridium_botulinum_B_str._Eklund..SMASHregion.region005..NR.5..BG.2 2.57 0.772 0.0497 3.16 0.835 0.0148
combined_results$negative_coef %>%
  select(!c(metadata, value, N_bang, N.not.0_bang, pval_bang, N_mal, N.not.0_mal, pval_mal)) %>% 
  rename(`Coefficient Bangladesh` = coef_bang, `Standard Error Bangladesh` = stderr_bang, `Q-value Bangladesh` = qval_bang, `Coefficient Malawi` = coef_mal, `Standard Error Malawi` = stderr_mal, `Q-value Malawi` = qval_mal) %>% 
  dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>% 
  kbl() %>% kable_styling()
MGC_class Species feature Coefficient Bangladesh Standard Error Bangladesh Q-value Bangladesh Coefficient Malawi Standard Error Malawi Q-value Malawi
NADH_dehydrogenase_I Actinomyces_viscosus_C505_genomic_scaffold gb.KI391968.1.region002.GC_DNA..Entryname.NADH_dehydrogenase_I..OS.Actinomyces_viscosus_C505_genomic_scaffold..SMASHregion.region002..NR.5 -3.01 0.574 0.000568 -2.9 0.796 0.0205
Nitrate_reductase Actinomyces_johnsonii_F0510_genomic_scaffold gb.KE951633.1.region001.GC_DNA..Entryname.Nitrate_reductase..OS.Actinomyces_johnsonii_F0510_genomic_scaffold..SMASHregion.region001..NR.2 -2.76 0.544 0.000832 -2.26 0.657 0.0332
Rnf_complex.TPP_AA_metabolism Eubacterium_siraeum_DSM_15702_Scfld_03_40_genomic gb.DS499548.1.region001.GC_DNA..Entryname.Rnf_complex.TPP_AA_metabolism..OS.Eubacterium_siraeum_DSM_15702_Scfld_03_40_genomic..SMASHregion.region001..NR.2 -2.44 0.727 0.047 -4.29 1.23 0.0298
combined_results$positive_coef %>% mutate(split_species = str_split(Species, '_')) %>% mutate(genus = sapply(split_species, "[[", 1)) %>% mutate(specific = sapply(split_species, "[[", 2)) %>% mutate(clean_species = paste(genus, specific, sep = ' ')) %>% select(-split_species, -genus, -specific) %>% relocate(clean_species, .after = Species) %>% group_by(MGC_class) %>% arrange(Species) %>% summarise(n = n(), Species = paste(clean_species, collapse = ', ')) %>% arrange(desc(n), MGC_class) %>%  kbl() %>% kable_styling()
MGC_class n Species
Pyruvate2acetate.formate 7 Clostridium butyricum, Clostridium carboxidivorans, Haemophilus aegyptius, Haemophilus haemolyticus, Haemophilus parainfluenzae, Prevotella copri, Prevotella sp.
TPP_AA_metabolism 4 Clostridium celatum, Haemophilus pittmaniae, Haemophilus sputorum, Prevotella sp.
PFOR_II_pathway 3 Clostridium celatum, Romboutsia ilealis, Romboutsia ilealis
Rnf_complex 3 Haemophilus parainfluenzae, Prevotella copri, Romboutsia ilealis
Others_HGD_unassigned 2 Prevotella copri, Romboutsia ilealis
Respiratory_glycerol 2 Aggregatibacter sp., Haemophilus parainfluenzae
acetate2butyrate 2 Clostridium botulinum, Clostridium celatum
Arginine2_Hcarbonate 1 .Clostridium. sordellii
Arginine2putrescine.Putrescine2spermidine 1 Romboutsia sp.
Formate_dehydrogenase 1 Haemophilus sp.
Fumarate2succinate 1 Haemophilus sp.
OD_eut_pdu_related.PFOR_II_pathway 1 Paeniclostridium sordellii
OD_fatty_acids 1 Dialister succinatiphilus
OD_fatty_acids.PFOR_II_pathway 1 Clostridium botulinum
Pyruvate2acetate.formate.Acetyl.CoA_pathway 1 Romboutsia ilealis
TPP_AA_metabolism.Arginine2putrescine 1 Haemophilus sp.
TPP_fatty_acids 1 Prevotella copri
porA 1 Prevotella copri
succinate2propionate 1 Dialister invisus
combined_results$negative_coef %>% mutate(split_species = str_split(Species, '_')) %>% mutate(genus = sapply(split_species, "[[", 1)) %>% mutate(specific = sapply(split_species, "[[", 2)) %>% mutate(clean_species = paste(genus, specific, sep = ' ')) %>% select(-split_species, -genus, -specific) %>% relocate(clean_species, .after = Species) %>% group_by(MGC_class) %>% arrange(Species) %>% summarise(n = n(), Species = paste(clean_species, collapse = ', ')) %>% arrange(desc(n), MGC_class) %>%  kbl() %>% kable_styling()
MGC_class n Species
NADH_dehydrogenase_I 1 Actinomyces viscosus
Nitrate_reductase 1 Actinomyces johnsonii
Rnf_complex.TPP_AA_metabolism 1 Eubacterium siraeum

1.7 healthy vs carrier

1.7.1 phylum plots

order_of_groups <- c("Carrier", "Healthy")
# bangladesh_phyla_plot <- plot_per_country_abundance(phyla_clean_metadata = phyla_clean_metadata, country = "Bangladesh", group_order = order_of_groups)
# bangladesh_phyla_plot
malawi_phyla_plot <- plot_per_country_abundance(phyla_clean_metadata = phyla_clean_metadata, country = "Malawi", group_order = order_of_groups)
nepal_phyla_plot <- plot_per_country_abundance(phyla_clean_metadata = phyla_clean_metadata, country = "Nepal", group_order = order_of_groups)

# bangladesh_phyla_plot / 
malawi_phyla_plot / nepal_phyla_plot

1.7.2 alpha diversity

Alpha diversity - all countries, healthy and carrier

all_countries_healthy_carrier_alpha <- metaphlan_alpha(strataa_metaphlan_data, strataa_metaphlan_metadata, countries_of_interest = c('Malawi', 'Nepal'), groups_of_interest = c('Carrier', 'Control_HealthySerosurvey'), comparisons = list(c('Carrier', 'Control_HealthySerosurvey')))
all_countries_healthy_carrier_alpha$alpha_by_country

all_countries_healthy_carrier_alpha$alpha_anova_summary_with_var_names %>% dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>% kbl() %>% kable_styling()
rownames(alpha_anova_summary[[1]]) Df Sum.Sq Mean.Sq F.value Pr..F. is_it_significant
Country:Group 1 4.82 4.82 40.9 3.97e-09 significant
Group 1 1.41 1.41 11.9 0.000786 significant
Group:Age 1 0.639 0.639 5.42 0.0217 not_significant
Country:Sex:Age 1 0.467 0.467 3.96 0.0491 not_significant
Country 1 0.438 0.438 3.72 0.0564 not_significant
Sex:Group 1 0.35 0.35 2.97 0.0875 not_significant
Age 1 0.0966 0.0966 0.819 0.367 not_significant
Country:Sex:Group 1 0.0738 0.0738 0.626 0.43 not_significant
Sex:Age 1 0.0673 0.0673 0.571 0.451 not_significant
Country:Group:Age 1 0.0151 0.0151 0.128 0.721 not_significant
Country:Sex 1 0.0139 0.0139 0.118 0.732 not_significant
Sex 1 0.00909 0.00909 0.0771 0.782 not_significant
Country:Age 1 0.00766 0.00766 0.065 0.799 not_significant
Country:Sex:Group:Age 1 0.00184 0.00184 0.0156 0.901 not_significant
Sex:Group:Age 1 6.12e-06 6.12e-06 5.19e-05 0.994 not_significant
Residuals 111 13.1 0.118 NA NA NA
all_countries_healthy_carrier_alpha$alpha_plot_group

1.7.3 beta diversity

Carrier vs healthy.

all_countries_beta_carrier_healthy <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Malawi', 'Nepal'), c('Carrier', 'Control_HealthySerosurvey'))
all_countries_beta_carrier_healthy$pn_res %>% kbl %>% kable_styling()
R2 Pr(>F) is_it_significant
Group 0.0653601 0.0009990 significant
Sex:Age 0.0141431 0.0299700 not_significant
Group:Age 0.0127135 0.0419580 not_significant
Age 0.0090662 0.2267732 not_significant
Sex:Group 0.0077753 0.3616384 not_significant
Sex 0.0072119 0.4595405 not_significant
Sex:Group:Age 0.0068119 0.5044955 not_significant
Total 1.0000000 NA NA
Residual 0.8769180 NA NA
# bgd_beta_carrier_healthy <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Bangladesh'), c('Carrier', 'Control_HealthySerosurvey'))
# bgd_beta_carrier_healthy$pn_res %>% kbl %>% kable_styling()

mal_beta_carrier_healthy <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Malawi'), c('Carrier', 'Control_HealthySerosurvey'))
mal_beta_carrier_healthy$pn_res %>% kbl %>% kable_styling()
R2 Pr(>F) is_it_significant
Group 0.1999878 0.0009990 significant
Sex:Age 0.0304467 0.0049950 significant
Age 0.0187062 0.0569431 not_significant
Group:Age 0.0147510 0.1398601 not_significant
Sex:Group 0.0118411 0.2587413 not_significant
Sex 0.0099215 0.3936064 not_significant
Sex:Group:Age 0.0046358 0.9180819 not_significant
Total 1.0000000 NA NA
Residual 0.7097099 NA NA
nep_beta_carrier_healthy <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Nepal'), c('Carrier', 'Control_HealthySerosurvey'))
nep_beta_carrier_healthy$pn_res %>% kbl %>% kable_styling()
R2 Pr(>F) is_it_significant
Group 0.0836999 0.0009990 significant
Sex:Group 0.0241616 0.2667333 not_significant
Sex:Age 0.0240379 0.2747253 not_significant
Age 0.0195876 0.4765235 not_significant
Sex 0.0174225 0.5934066 not_significant
Sex:Group:Age 0.0149025 0.7772228 not_significant
Group:Age 0.0098712 0.9760240 not_significant
Total 1.0000000 NA NA
Residual 0.8063168 NA NA
all_countries_beta_carrier_healthy$pc12 + mal_beta_carrier_healthy$pc12 + nep_beta_carrier_healthy$pc12

1.7.4 maaslin taxonomic groups

groups_to_analyse <- c('Carrier', 'Control_HealthySerosurvey')
# bang_variables_for_analysis <- c("Group", "Sex", "Age")
mwi_variables_for_analysis <- c("Group", "Sex", "Age", "sequencing_lane")
nep_variables_for_analysis <- c("Group", "Sex", "Age")
# bangladesh_taxonomic_maaslin <- read_in_maaslin('Bangladesh', groups_to_analyse, bang_variables_for_analysis, 'metaphlan')
malawi_taxonomic_maaslin <- read_in_maaslin('Malawi', groups_to_analyse, mwi_variables_for_analysis, 'metaphlan')
nepal_taxonomic_maaslin <- read_in_maaslin('Nepal', groups_to_analyse, nep_variables_for_analysis, 'metaphlan')
# bangladesh_taxonomic_maaslin_filtered <- filter_taxonomic_maaslin(bangladesh_taxonomic_maaslin)
malawi_taxonomic_maaslin_filtered <- filter_taxonomic_maaslin(malawi_taxonomic_maaslin)
nepal_taxonomic_maaslin_filtered <- filter_taxonomic_maaslin(nepal_taxonomic_maaslin)
# bangladesh_maaslin_stats <- basic_maaslin_stats(bangladesh_taxonomic_maaslin_filtered, 'Bangladesh', bang_variables_for_analysis, groups_to_analyse)
malawi_maaslin_stats <- basic_maaslin_stats(malawi_taxonomic_maaslin_filtered, 'Malawi', mwi_variables_for_analysis, groups_to_analyse)
nepal_maaslin_stats <- basic_maaslin_stats(nepal_taxonomic_maaslin_filtered, 'Nepal', nep_variables_for_analysis, groups_to_analyse)
# / bangladesh_maaslin_stats$volcano_plot 
malawi_maaslin_stats$volcano_plot / nepal_maaslin_stats$volcano_plot

nrow(malawi_maaslin_stats$maaslin_results_sig)
## [1] 117
# nrow(bangladesh_maaslin_stats$maaslin_results_sig)
nrow(nepal_maaslin_stats$maaslin_results_sig)
## [1] 1

We’re not including the bangladesh samples in this analysis, because the bangladesh carriers were processed differently (extracted without being frozen).

There are no species significantly associated with carrier status in all Mal and Nep, so not including the combine.

# combined_results <- run_combine_maaslins(bangladesh_taxonomic_maaslin_filtered, malawi_taxonomic_maaslin_filtered, bang_variables_for_analysis, mwi_variables_for_analysis, groups_to_analyse, 'metaphlan', maaslin_taxonomic_output_root_folder)

# only one species significantly associated in Nepal.
# there are no species consistently associated across all three sites
# bang_mal_nep <- list(bangladesh_taxonomic_maaslin_filtered, malawi_taxonomic_maaslin_filtered, nepal_taxonomic_maaslin_filtered)
# combined_results <- run_combine_maaslins(bang_mal_nep, c('_bang', '_mwi', '_nep'), mwi_variables_for_analysis, groups_to_analyse, 'metaphlan', maaslin_taxonomic_output_root_folder)
# View(combined_results$positive_coef)
# View(combined_results$negative_coef)

# there are no species associated in both bang and nep
# bang_nep <- list(bangladesh_taxonomic_maaslin_filtered, nepal_taxonomic_maaslin_filtered)
# combined_results <- run_combine_maaslins(bang_nep, c('_bang', '_nep'), bang_variables_for_analysis, groups_to_analyse, 'metaphlan', maaslin_taxonomic_output_root_folder)
# View(combined_results$positive_coef)
# View(combined_results$negative_coef)

# there are no species associated in both mal and nep
# mal_nep <- list(malawi_taxonomic_maaslin_filtered, nepal_taxonomic_maaslin_filtered)
# combined_results <- run_combine_maaslins(bang_nep, c('_mal', '_nep'), bang_variables_for_analysis, groups_to_analyse, 'metaphlan', maaslin_taxonomic_output_root_folder)
# View(combined_results$positive_coef)
# View(combined_results$negative_coef)

# bang_mal <- list(bangladesh_taxonomic_maaslin_filtered, malawi_taxonomic_maaslin_filtered)
# combined_results <- run_combine_maaslins(bang_mal, c('_bang', '_mal'), mwi_variables_for_analysis, groups_to_analyse, 'metaphlan', maaslin_taxonomic_output_root_folder)

# # positive is associated with health
# combined_results$positive_coef %>% filter(grepl('^s', lowest_taxonomic_level)) %>% 
#   select(!c(metadata, value, N_bang, N.not.0_bang, pval_bang, N_mal, N.not.0_mal, pval_mal)) %>% 
#   rename(Species = lowest_taxonomic_level, `Coefficient Bangladesh` = coef_bang, `Standard Error Bangladesh` = stderr_bang, `Q-value Bangladesh` = qval_bang, `Coefficient Malawi` = coef_mal, `Standard Error Malawi` = stderr_mal, `Q-value Malawi` = qval_mal) %>% 
#   dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>% 
#   kbl() %>% kable_styling()

# # negative is associated with carrier
# combined_results$negative_coef %>% filter(grepl('^s', lowest_taxonomic_level)) %>% 
#   select(!c(metadata, value, N_bang, N.not.0_bang, pval_bang, N_mal, N.not.0_mal, pval_mal)) %>% 
#   rename(Species = lowest_taxonomic_level, `Coefficient Bangladesh` = coef_bang, `Standard Error Bangladesh` = stderr_bang, `Q-value Bangladesh` = qval_bang, `Coefficient Malawi` = coef_mal, `Standard Error Malawi` = stderr_mal, `Q-value Malawi` = qval_mal) %>% 
#   dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>% 
#   kbl() %>% kable_styling()

# nrow(combined_results$mwi_maaslin_only)
# nrow(combined_results$bang_maaslin_only)

1.7.5 functional groups

# bang_variables_for_analysis <- c("Group", "Sex", "Age")
mwi_variables_for_analysis <- c("Group", "Sex", "Age", "sequencing_lane")
# nep_variables_for_analysis <- c("Group", "Sex", "Age", "Antibiotics_taken_before_sampling_yes_no_assumptions")
groups_to_analyse <- c('Carrier', 'Control_HealthySerosurvey')

# bang_func_carr_health_maaslin <- read_in_maaslin('Bangladesh', groups_to_analyse, bang_variables_for_analysis, 'bigmap')
mal_func_carr_health_maaslin <- read_in_maaslin('Malawi', groups_to_analyse, mwi_variables_for_analysis, 'bigmap')
# combined_results <- run_combine_maaslins(bang_func_carr_health_maaslin, mal_func_carr_health_maaslin, bang_variables_for_analysis, mwi_variables_for_analysis, groups_to_analyse, 'bigmap', maaslin_functional_output_root_folder)

# bang_mal <- list(bang_func_carr_health_maaslin, mal_func_carr_health_maaslin)
# combined_results <- run_combine_maaslins(bang_mal, c('_bang', '_mal'), mwi_variables_for_analysis, groups_to_analyse, 'bigmap', maaslin_functional_output_root_folder)

# # positive is associated with health
# # combined_results$positive_coef %>%  kbl() %>% kable_styling()
# combined_results$positive_coef %>% 
#   select(!c(metadata, value, N_bang, N.not.0_bang, pval_bang, N_mal, N.not.0_mal, pval_mal)) %>% 
#   rename(`Coefficient Bangladesh` = coef_bang, `Standard Error Bangladesh` = stderr_bang, `Q-value Bangladesh` = qval_bang, `Coefficient Malawi` = coef_mal, `Standard Error Malawi` = stderr_mal, `Q-value Malawi` = qval_mal) %>% 
#   dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>% 
#   kbl() %>% kable_styling()
# # negative is associated with carrier
# # combined_results$negative_coef %>%  kbl() %>% kable_styling()
# combined_results$negative_coef %>% 
#   select(!c(metadata, value, N_bang, N.not.0_bang, pval_bang, N_mal, N.not.0_mal, pval_mal)) %>% 
#   rename(`Coefficient Bangladesh` = coef_bang, `Standard Error Bangladesh` = stderr_bang, `Q-value Bangladesh` = qval_bang, `Coefficient Malawi` = coef_mal, `Standard Error Malawi` = stderr_mal, `Q-value Malawi` = qval_mal) %>% 
#   dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>% 
#   kbl() %>% kable_styling()


# # print out a nicely formatted table of the count of the MGCs
# combined_results$positive_coef %>% mutate(split_species = str_split(Species, '_')) %>% mutate(genus = sapply(split_species, "[[", 1)) %>% mutate(specific = sapply(split_species, "[[", 2)) %>% mutate(clean_species = paste(genus, specific, sep = ' ')) %>% select(-split_species, -genus, -specific) %>% relocate(clean_species, .after = Species) %>% group_by(MGC_class) %>% arrange(Species) %>% summarise(n = n(), Species = paste(clean_species, collapse = ', ')) %>% arrange(desc(n), MGC_class) %>%  kbl() %>% kable_styling()
# combined_results$negative_coef %>% mutate(split_species = str_split(Species, '_')) %>% mutate(genus = sapply(split_species, "[[", 1)) %>% mutate(specific = sapply(split_species, "[[", 2)) %>% mutate(clean_species = paste(genus, specific, sep = ' ')) %>% select(-split_species, -genus, -specific) %>% relocate(clean_species, .after = Species) %>% group_by(MGC_class) %>% arrange(Species) %>% summarise(n = n(), Species = paste(clean_species, collapse = ', ')) %>% arrange(desc(n), MGC_class) %>%  kbl() %>% kable_styling()